{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "47c63171-a4b6-49fb-a26d-531adc1464f8",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# Yellow Taxi NYC\n",
    "\n",
    "### Data Profiling in Databricks with ydata-profiling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "18fdccc1-4779-4062-981a-882fa043538c",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "Read a Delta table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "9f0e1f63-717a-4653-98fc-bb5119ae9c63",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Table that the rest of the notebook profiles.\n",
    "input_table_name = \"default.yellowtaxi_trips\"\n",
    "\n",
    "# Load it through the notebook-provided `spark` session.\n",
    "df = spark.table(input_table_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "21da1ee5-9055-4995-b777-f2081f91e7d4",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Show the loaded table using the notebook's `display` helper.\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "565d9304-1f65-42c1-91a3-01f182d44cd2",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "## Data profiling with YData Profiling\n",
    "\n",
    "pandas-profiling is now ydata-profiling and includes support for Spark dataframes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "eebb3d44-b088-4f45-b55b-6bd527ea7f7a",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "from ydata_profiling import ProfileReport\n",
    "\n",
    "# Correlation settings: \"auto\" is not calculated; Pearson and Spearman are.\n",
    "correlation_settings = {\n",
    "    \"auto\": {\"calculate\": False},\n",
    "    \"pearson\": {\"calculate\": True},\n",
    "    \"spearman\": {\"calculate\": True},\n",
    "}\n",
    "\n",
    "# Build the report for `df` with dtype inference disabled; interactions and\n",
    "# missing-value diagrams are both passed as None.\n",
    "report = ProfileReport(\n",
    "    df,\n",
    "    title=\"NYC yellow taxi trip\",\n",
    "    infer_dtypes=False,\n",
    "    interactions=None,\n",
    "    missing_diagrams=None,\n",
    "    correlations=correlation_settings,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "b4a28624-1b4f-4879-a4a4-cf34ec9e6bc0",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "#### Display as HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "eefeac1e-2d58-4d23-a0ec-46b1d69e44bf",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Render the report to HTML, then embed that HTML in the cell output.\n",
    "report_html = report.to_html()\n",
    "displayHTML(report_html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "0a7fbe9c-7b0f-460f-99d7-552736c1091c",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "#### Extract the profile as JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "cd3f32fc-344d-4a93-b037-a56952903979",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Keep the report's JSON export in a variable so the next cell can show it.\n",
    "profile_json = report.to_json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "4c569eaa-5cb5-471c-9c7c-cb7d9f7cd1f5",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Bare last-line expression: the value becomes the cell's output.\n",
    "profile_json"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "mostRecentlyExecutedCommandWithImplicitDF": {
     "commandId": 2648559141144570,
     "dataframes": [
      "_sqldf"
     ]
    },
    "pythonIndentUnit": 4
   },
   "notebookName": "YData-profiling in Databricks",
   "notebookOrigID": 329200988581789,
   "widgets": {}
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
